# NumPy
import numpy as np
# Pandas
import pandas as pd
# Matplotlib
import matplotlib.pyplot as plt
# BeautifulSoup
from bs4 import BeautifulSoup
# Request
import requests
# RegEX
import re

# IPython shell escape (the leading "!" runs the rest of the line as a shell
# command), so this line only works inside Jupyter/IPython, not plain Python.
# It asks nbconvert to render the named notebook to HTML.
# NOTE(review): the notebook name refers to customer segmentation, while the
# code below scrapes an Academy Award film list — confirm the filename.
!jupyter nbconvert --to html customer-segmentation-using-clustering.ipynb

# Download the Wikipedia page that lists Academy Award-winning films.
url = "https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films"
# requests applies no timeout unless one is passed, so a stalled connection
# would block this cell indefinitely; cap the wait at 30 seconds.
req = requests.get(url, timeout=30)

# Display the response object; its repr includes the HTTP status code.
req

<Response [200]>

# Parse the downloaded bytes into a searchable document tree.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and warns), so the same notebook could
# build a different tree on another machine.
soup = BeautifulSoup(req.content, "html.parser")

# DATA CLEANING

# Deal every <td> on the page, in document order, into four column lists:
# Film -> Year -> Award -> Nomination -> Film -> ...
# NOTE(review): this assumes each <td> belongs to a four-cell row of the film
# table; cells from any other table on the page are dealt in as well, which
# would shift the columns — confirm against the page layout.
Film = []
Year = []
Award = []
Nomination = []
columns = (Film, Year, Award, Nomination)
count = 0  # index of the column that receives the next cell
for cell in soup.find_all('td'):
    # Take the cell's text through the parser rather than stripping tags with
    # a regex: the earlier greedy patterns ('^<td>.*">' and '<.*>') removed
    # everything up to the last tag in the cell, so a cell whose text is split
    # across several tags kept only its final fragment.
    # NOTE: get_text() returns all text inside the cell, including any
    # reference markers if present.
    text = cell.get_text().strip()
    columns[count].append(text)
    count = (count + 1) % len(columns)

# Assemble the result table from the four scraped lists. Each list is cut to
# its first 1332 entries, so the columns line up as long as every list holds
# at least that many items.
# NOTE(review): 1332 presumably matches the number of films on the page at
# scrape time — confirm before re-running against the live page.
df = pd.DataFrame({
    label: values[:1332]
    for label, values in (
        ("Film", Film),
        ("Years", Year),
        ("Awards", Award),
        ("Nominations", Nomination),
    )
})
df

# First ten rows of the table.
df.head(n=10)

# Last ten rows of the table.
df.tail(n=10)

	Film	Years	Awards	Nominations
0	Nomadland	21	3	6
1	The Father	21	2	6
2	Judas and the Black Messiah	21	2	6
3	Minari	21	1	6
4	Mank	21	2	10
...	...	...	...	...
1327	The Yankee Doodle Mouse	1943	1	1
1328	The Yearling	1946	2	7
1329	Yesterday, Today and Tomorrow	1964	1	1
1330	You Can't Take It with You	1938	2	7
1331	Zorba the Greek	1964	3	7

	Film	Years	Awards	Nominations
0	Nomadland	21	3	6
1	The Father	21	2	6
2	Judas and the Black Messiah	21	2	6
3	Minari	21	1	6
4	Mank	21	2	10
5	Sound of Metal	21	2	6
6	Ma Rainey's Black Bottom	21	2	5
7	Promising Young Woman	21	1	5
8	Tenet	21	1	2
9	Soul	21	2	3

	Film	Years	Awards	Nominations
1322	World Without Sun	1964	1	1
1323	Wrestling Swordfish	32	1	1
1324	Written on the Wind	1956	1	3
1325	Wuthering Heights	1939	1	8
1326	Yankee Doodle Dandy	1942	3	8
1327	The Yankee Doodle Mouse	1943	1	1
1328	The Yearling	1946	2	7
1329	Yesterday, Today and Tomorrow	1964	1	1
1330	You Can't Take It with You	1938	2	7
1331	Zorba the Greek	1964	3	7